# Covariate Processing Functions  
# Alex F. Gazmararian
# agazmararian@gmail.com

#' Convert a numeric vector to z-scores
#'
#' Subtracts the mean and divides by the standard deviation, both computed
#' with missing values dropped. Missing entries of `x` stay missing in the
#' result.
#'
#' @param x Numeric vector to standardize
#' @return Numeric vector of the same length as `x`, centred on 0 with unit
#'   standard deviation
zfun <- function(x) {
  center <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  (x - center) / spread
}

#' Validate district numbers
#'
#' Element-wise check that each district number is numeric, non-missing and
#' at least 1. Works on vectors of any length (including length 0), so it can
#' be used directly on a data-frame column.
#'
#' @param district Numeric vector of district numbers
#' @return Logical vector the same length as `district`: `TRUE` where the
#'   value is a non-missing number >= 1, `FALSE` otherwise. Non-numeric input
#'   yields `FALSE` for every element.
validate_district <- function(district) {
  # Non-numeric input (character, logical, ...) can never be a valid district.
  if (!is.numeric(district)) {
    return(rep(FALSE, length(district)))
  }

  # `&` (not `&&`) keeps the check element-wise; is.na() also catches NaN.
  !is.na(district) & district >= 1
}

#' Clean rating text by removing specified patterns
#'
#' Each pattern is wrapped in word boundaries (`\\b`) and the patterns are
#' joined with `|`, so any of them is removed wherever it occurs as a whole
#' word. Whitespace is then normalised with `stringr::str_squish()`, which
#' trims both ends and collapses internal runs of whitespace.
#'
#' @param rating_text Character vector of rating descriptions
#' @param ... Character patterns to remove (interpreted as regular
#'   expressions by stringr). If none are supplied, only whitespace is
#'   normalised.
#' @return Character vector of cleaned ratings
clean_rating_text <- function(rating_text, ...) {
  removal_patterns <- c(...)

  # Nothing to strip: skip building a degenerate "\\b\\b" pattern.
  if (length(removal_patterns) == 0) {
    return(stringr::str_squish(rating_text))
  }

  pattern_collapse <- paste0("\\b", removal_patterns, "\\b", collapse = "|")

  # str_squish() already trims leading/trailing whitespace, so a separate
  # str_trim() step is unnecessary.
  stringr::str_squish(stringr::str_remove_all(rating_text, pattern_collapse))
}

#' Validate election coverage across years
#'
#' Reports, via `message()`, the number of rows for each value of `year`
#' (most frequent first), then warns about any year that is absent from the
#' two-year sequence running from the earliest to the latest observed year.
#' Problems are signalled as warnings; the function never modifies `data`.
#'
#' @param data Data frame with election data. Needs a `year` column for the
#'   check to run; otherwise a warning is issued and `data` is returned.
#' @param stage_name Character string describing validation stage (used only
#'   in the header message)
#' @return `data`, invisibly and unchanged
validate_election_coverage <- function(data, stage_name) {
  message("=== Election Coverage Validation: ", stage_name, " ===")

  if (!"year" %in% names(data)) {
    warning("No 'year' column found in data")
    return(invisible(data))
  }

  # With zero rows or an all-NA column, min()/max() would return Inf/-Inf and
  # the seq() call below would fail, so stop here with a warning instead.
  observed_years <- data[["year"]][!is.na(data[["year"]])]
  if (length(observed_years) == 0) {
    warning("No non-missing values in 'year'; skipping coverage check")
    return(invisible(data))
  }

  year_coverage <- dplyr::count(data, year, sort = TRUE)

  message("Election years covered:")
  for (i in seq_len(nrow(year_coverage))) {
    message(sprintf(
      "  %d: %d races",
      year_coverage$year[i], year_coverage$n[i]
    ))
  }

  # Expected schedule: every second year starting from the earliest observed
  # year (so the parity of the sequence follows that first year).
  election_years <- seq(min(observed_years), max(observed_years), by = 2)
  missing_years <- setdiff(election_years, observed_years)

  if (length(missing_years) > 0) {
    warning(sprintf(
      "Missing election years: %s",
      paste(missing_years, collapse = ", ")
    ))
  }

  invisible(data)
}

#' Sanity-check vote shares and margins
#'
#' Emits a header message, then warns if any `vote_share` lies outside
#' [0, 1] or any `margin` exceeds 1 in absolute value, and finally prints
#' summary statistics via `message()`. Missing values are ignored in every
#' count and mean. If a required column is absent, a warning is issued and
#' the remaining checks are skipped.
#'
#' @param data Data frame with `vote_share` and `margin` columns
#' @param stage_name Character string describing validation stage
#' @return `data`, invisibly and unchanged
validate_vote_margins <- function(data, stage_name) {
  message("=== Vote Margins Validation: ", stage_name, " ===")

  # Bail out early when either required column is absent
  absent <- setdiff(c("vote_share", "margin"), names(data))
  if (length(absent) > 0) {
    warning(sprintf("Missing required columns: %s", paste(absent, collapse = ", ")))
    return(invisible(data))
  }

  # Vote shares must lie in the unit interval
  shares <- data[["vote_share"]]
  n_bad_shares <- sum(shares < 0 | shares > 1, na.rm = TRUE)
  if (n_bad_shares > 0) {
    warning(sprintf("%d observations have vote shares outside [0,1]", n_bad_shares))
  }

  # Margins cannot exceed 1 in absolute value
  abs_margin <- abs(data[["margin"]])
  n_bad_margins <- sum(abs_margin > 1, na.rm = TRUE)
  if (n_bad_margins > 0) {
    warning(sprintf("%d observations have absolute margins > 1", n_bad_margins))
  }

  # Descriptive summary
  n_competitive <- sum(abs_margin < 0.1, na.rm = TRUE)
  message(sprintf("Mean vote share: %.3f", mean(shares, na.rm = TRUE)))
  message(sprintf("Mean absolute margin: %.3f", mean(abs_margin, na.rm = TRUE)))
  message(sprintf("Competitive races (margin < 0.1): %d", n_competitive))

  invisible(data)
}
